import requests
import re
import csv
import xlsxwriter

# Site root; relative child-page links found on the home page are appended to it.
domain = 'https://www.dytt89.com/'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
# NOTE(review): verify=False turns off TLS certificate verification for this
# request; kept as in the original script, but confirm it is still required.
# The timeout stops the script from hanging forever on a stalled connection.
res = requests.get(domain, headers=headers, verify=False, timeout=10)
res.encoding = 'gb2312'  # decode the body as gb2312 instead of the guessed charset

# obj1: the <ul> that follows the "2022必看热片" heading on the home page.
obj1 = re.compile(r"2022必看热片.*?<ul>(?P<ul>.*?)</ul>",re.S)
# obj2: the href of every single-quoted <a> link inside that <ul>.
obj2 = re.compile(r"<a href='(?P<href>.*?)'",re.S)
# obj3: on a child page, the title between 《 and 》 plus the first link in the
# highlighted download cell.
obj3 = re.compile(r'<title>.*?《(?P<move_name>.*?)》.*?</title>.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="'
                  r'(?P<download_url>.*?)"',re.S)

# Absolute URLs of the child pages: site root + the href with its leading and
# trailing slashes removed.
child_href_list = []
for section in obj1.finditer(res.text):
    child_href_list.extend(
        domain + link.group('href').strip("/")
        for link in obj2.finditer(section.group('ul'))
    )

# Scrape every child page and store (movie title, download link) rows in an
# Excel workbook. XlsxWriter only writes the file when the workbook is closed,
# so the context manager guarantees that rows collected so far are saved even
# if a later page raises.
with xlsxwriter.Workbook(r'D:\ui\reptile\data\movie_data.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    worksheet.write(0, 0, '电影名称')  # row 0, column 0: movie-title header
    worksheet.write(0, 1, '迅雷链接')  # row 0, column 1: download-link header

    row = 1  # next free data row; row 0 holds the headers
    for href in child_href_list:
        try:
            # NOTE(review): verify=False disables TLS certificate checks; kept
            # from the original. The timeout prevents an indefinite hang.
            child_res = requests.get(href, headers=headers, verify=False, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole export.
            print(f'skipped {href}: request failed ({exc})')
            continue
        child_res.encoding = 'gb2312'

        match = obj3.search(child_res.text)
        if match is None:
            # search() returns None when the page lacks the expected title or
            # download cell; report it instead of crashing on .groupdict().
            print(f'skipped {href}: title/download link not found')
            continue

        worksheet.write(row, 0, match.group('move_name'))
        worksheet.write(row, 1, match.group('download_url'))
        row += 1

# Release the home-page response fetched at the top of the script.
res.close()

print('over!')